House Price Prediction Using a Linear Regression Model
First, we install and load all the packages required for the linear regression analysis.
install.packages('readr')
install.packages('ggplot2')
install.packages('mlbench')
install.packages('corrplot')
install.packages('Amelia')
install.packages('caret')
install.packages('plotly')
install.packages('caTools')
install.packages('reshape2')
install.packages('dplyr')
library(readr)
library(ggplot2)
library(corrplot)
library(mlbench)
library(Amelia)
library(plotly)
library(reshape2)
library(caret)
library(caTools)
library(dplyr)
Next, we load the cleaned dataset.
# Load the housing data into `housing`.
# `data(Housing)` cannot find a data set of that name among the attached
# packages, so the script previously only ran when a `Housing` object was
# already present in the workspace. Keep using that object when it exists;
# otherwise fall back to the Boston housing data shipped with mlbench, which
# carries the columns referenced below (crim, rm, tax, lstat, medv, chas, ...).
# NOTE(review): if a specific cleaned CSV is intended, load it explicitly
# (e.g. with readr::read_csv) instead of relying on the workspace.
if (!exists("Housing")) {
  data("BostonHousing", package = "mlbench")
  Housing <- BostonHousing
}
housing <- Housing
# Correlation plot of every column except `chas`.
corr_matrix <- cor(select(housing, -chas))
corrplot(corr_matrix)

# Density of medv: build the plot once, show it as a static ggplot and then
# as an interactive plotly widget.
medv_density <- ggplot(housing, aes(medv)) +
  stat_density() +
  theme_bw()
medv_density
ggplotly(medv_density)
# Scatter each selected predictor against medv, one facet per predictor.
# melt() stacks the predictor columns into `variable` / `value` pairs while
# keeping medv as the id column, so a single ggplot call can facet on them.
housing %>%
  select(crim, rm, age, rad, tax, lstat, medv, indus, nox, ptratio, zn) %>%
  melt(id.vars = "medv") %>%
  ggplot(aes(x = value, y = medv, colour = variable)) +
  geom_point(alpha = 0.7) +
  # A constant colour must be set outside aes(). Inside aes() the string
  # "black" is treated as data: the line gets a palette colour and a stray
  # "black" key appears in the legend.
  stat_smooth(colour = "black") +
  facet_wrap(~variable, scales = "free", ncol = 2) +
  labs(x = "Variable Value", y = "Median House Price ($1000s)") +
  theme_minimal()
`geom_smooth()` using method = 'loess' and formula 'y ~ x'
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric, :
pseudoinverse used at -0.5
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric, :
neighborhood radius 13
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric, :
reciprocal condition number 4.5194e-15
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric, :
There are other near singularities as well. 156.25
Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x else if (is.data.frame(newdata)) as.matrix(model.frame(delete.response(terms(object)), :
pseudoinverse used at -0.5
Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x else if (is.data.frame(newdata)) as.matrix(model.frame(delete.response(terms(object)), :
neighborhood radius 13
Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x else if (is.data.frame(newdata)) as.matrix(model.frame(delete.response(terms(object)), :
reciprocal condition number 4.5194e-15
Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x else if (is.data.frame(newdata)) as.matrix(model.frame(delete.response(terms(object)), :
There are other near singularities as well. 156.25
# Reproducible 75/25 train/test split.
set.seed(123)
# sample.split() expects a vector with one entry per observation (here the
# outcome column). Given the whole data frame it returns one flag per COLUMN,
# and subset() then recycles that short vector across the rows, so the rows
# are picked by a repeating pattern rather than sampled.
split <- sample.split(housing$medv, SplitRatio = 0.75)
train <- subset(housing, split == TRUE)
test <- subset(housing, split == FALSE)

# Fit medv on crim, rm, tax and lstat using the training rows only, then
# print the coefficient table and fit statistics.
model <- lm(medv ~ crim + rm + tax + lstat, data = train)
summary(model)
Call:
lm(formula = medv ~ crim + rm + tax + lstat, data = train)
Residuals:
Min 1Q Median
-16.849 -3.267 -1.009
3Q Max
2.147 29.690
Coefficients:
Estimate
(Intercept) -4.320521
crim -0.077099
rm 5.608630
tax -0.004938
lstat -0.494856
Std. Error
(Intercept) 3.588604
crim 0.036376
rm 0.495616
tax 0.002070
lstat 0.058225
t value Pr(>|t|)
(Intercept) -1.204 0.2294
crim -2.120 0.0347
rm 11.316 < 2e-16
tax -2.386 0.0176
lstat -8.499 5.27e-16
(Intercept)
crim *
rm ***
tax *
lstat ***
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01
‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 5.203 on 357 degrees of freedom
Multiple R-squared: 0.6754, Adjusted R-squared: 0.6718
F-statistic: 185.7 on 4 and 357 DF, p-value: < 2.2e-16
# Collect the model residuals in a one-column data frame (column `res`) and
# draw their histogram.
res <- data.frame(res = residuals(model))
ggplot(res, aes(x = res)) +
  geom_histogram(fill = "blue", alpha = 0.5)
`stat_bin()` using `bins =
30`. Pick better value with
`binwidth`.
# Base-R diagnostic plots for the fitted linear model.
plot(model)

# Score the held-out rows and keep the predictions next to the actual values.
test$predicted.medv <- predict(model, newdata = test)

# Actual vs. predicted medv with a smoothed trend line, shown interactively.
pl1 <- ggplot(test, aes(medv, predicted.medv)) +
  geom_point(alpha = 0.5) +
  # Set the constant colour outside aes(); inside aes() the string "black"
  # is mapped as data, which yields a palette-coloured line plus a legend
  # entry labelled "black".
  stat_smooth(colour = "black") +
  xlab("Actual value of medv") +
  ylab("Predicted value of medv") +
  theme_bw()
ggplotly(pl1)
`geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Root mean squared error on the test rows: square each prediction error,
# average the squares, then take the square root. Squaring the mean instead
# would collapse to |mean(error)|, letting positive and negative errors
# cancel and understating the error.
error <- test$medv - test$predicted.medv
rmse <- sqrt(mean(error^2))